Processing and visualizing 89k cells from Packer et al. 2019 C. elegans 10xv2 single cell data
Original article:
A lineage-resolved molecular atlas of C. elegans embryogenesis at single-cell resolution
https://science.sciencemag.org/content/365/6459/eaax1971.long
The anndata object we provide has 89701 cells and 20222 genes. It includes short gene descriptions from WormBase that will show up when mousing over the interactive plots.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scvi.dataset import GeneExpressionDataset
from scvi.models import VAE
from scvi.inference import UnsupervisedTrainer
import torch
import anndata
import plotly.express as px
import plotly.graph_objects as go
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
# Change the path where the models will be saved
save_path = "./"
vae_file_name = 'worm_vae.pkl'
if os.path.isfile('packer.h5ad'):
print ("Found the data file! No need to download.")
else:
print ("Downloading data...")
! wget https://github.com/Munfred/worm-notebooks/releases/download/Packer/packer.h5ad
adata = anndata.read('packer.h5ad')
adata
adata.X
The gene descriptions were taken using the WormBase API.
display(adata.var.head().style.set_properties(subset=['gene_description'], **{'width': '600px'}))
adata.obs.head().T
We load the Packer data and use the batch annotations for scVI. Here, each experiment correspond to a batch.
gene_dataset = GeneExpressionDataset()
# we provide the `batch_indices` so that scvi can perform batch correction
gene_dataset.populate_from_data(
adata.X,
gene_names=adata.var.index.values,
cell_types=adata.obs['cell_type'].values,
batch_indices=adata.obs['batch'].cat.codes.values,
)
# We select the 1000 most variable genes, which is the default selection criteria of scvi
gene_dataset.subsample_genes(1000)
sel_genes = gene_dataset.gene_names
adata.obs['cell_type'].values
# for this dataset 5 epochs is sufficient
n_epochs = 5
lr = 1e-3
use_cuda = False
We now create the model and the trainer object. We train the model and output model likelihood every epoch. In order to evaluate the likelihood on a test set, we split the datasets (the current code can also so train/validation/test).
If a pre-trained model already exist in the save_path then load the same model rather than re-training it. This is particularly useful for large datasets.
# set the VAE to perform batch correction
vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches)
trainer = UnsupervisedTrainer(
vae,
gene_dataset,
train_size=0.75, # number between 0 and 1, default 0.8
use_cuda=use_cuda,
frequency=1,
)
# check if a previously trained model already exists, if yes load it
full_file_save_path = os.path.join(save_path, vae_file_name)
if os.path.isfile(full_file_save_path):
trainer.model.load_state_dict(torch.load(full_file_save_path))
trainer.model.eval()
else:
trainer.train(n_epochs=n_epochs, lr=lr)
torch.save(trainer.model.state_dict(), full_file_save_path)
train_test_results = pd.DataFrame(trainer.history).rename(columns={'elbo_train_set':'Train', 'elbo_test_set':'Test'})
train_test_results
ax = train_test_results.plot()
ax.set_xlabel("Epoch")
ax.set_ylabel("Error")
plt.show()
The posterior object contains a model and a gene_dataset, as well as additional arguments that for Pytorch's DataLoader. It also comes with many methods or utilities querying the model, such as differential expression, imputation and differential analyisis.
To get an ordered output result, we might use .sequential posterior's method which return another instance of posterior (with shallow copy of all its object references), but where the iteration is in the same ordered as its indices attribute.
full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()
# scvi tutorial latent space code
full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
batch_indices = batch_indices.ravel()
latent.shape
# store the latent space in a new anndata object
post_adata = anndata.AnnData(X=gene_dataset.X)
post_adata.obs=adata.obs
post_adata.obsm["latent"] = latent
post_adata
# here's a hack to randomize categorical colors, since plotly can't do that in a straightforward manner
# we take the list of named css colors that it recognizes, and we picked a color based on the code of
# the cluster we are coloring
css_colors=[
'aliceblue','antiquewhite','aqua','aquamarine','azure','bisque','black','blanchedalmond','blue',
'blueviolet','brown','burlywood','cadetblue','chartreuse','chocolate','coral','cornflowerblue',
'crimson','cyan','darkblue','darkcyan','darkgoldenrod','darkgray','darkgrey','darkgreen','darkkhaki',
'darkmagenta','darkolivegreen','darkorange','darkorchid','darkred','darksalmon','darkseagreen',
'darkslateblue','darkslategray','darkslategrey','darkturquoise','darkviolet','deeppink','deepskyblue',
'dimgray','dimgrey','dodgerblue','firebrick','floralwhite','forestgreen','fuchsia','gainsboro','ghostwhite',
'gold','goldenrod','gray','grey','green','greenyellow','honeydew','hotpink','indianred','indigo',
'ivory','khaki','lavender','lavenderblush','lawngreen','lemonchiffon','lightblue','lightcoral','lightcyan',
'lightgoldenrodyellow','lightgray','lightgrey','lightgreen','lightpink','lightsalmon','lightseagreen',
'lightskyblue','lightslategray','lightslategrey','lightsteelblue','lightyellow','lime','limegreen','linen',
'magenta','maroon','mediumaquamarine','mediumblue','mediumorchid','mediumpurple','mediumseagreen',
'mediumslateblue','mediumspringgreen','mediumturquoise','mediumvioletred','midnightblue','mintcream',
'mistyrose','moccasin','navajowhite','navy','oldlace','olive','olivedrab','orange','orangered','orchid',
'palegoldenrod','palegreen','paleturquoise','palevioletred','papayawhip','peachpuff','peru','pink','plum'
,'powderblue','purple','red','rosybrown','royalblue','saddlebrown','salmon','sandybrown','seagreen',
'seashell','sienna','silver','skyblue','slateblue','slategray','slategrey','snow','springgreen','steelblue',
'tan','teal','thistle','tomato','turquoise','violet','wheat','white','whitesmoke','yellow','yellowgreen']
# we just repeat the list of colors a bunch of times to ensure we always have more colors than clusters
css_colors = css_colors*100
# now define a function to plot any embedding
def plot_embedding(embedding_kind, # the embedding must be a label in the post_adata.obsm
adata=adata, # the original adata for taking the cluster labels
post_adata=post_adata,
cluster_feature ='cell_type',
xlabel="Dimension 1",
ylabel="Dimension 2",
plot_title="Embedding on single cell data"):
# `cluster_feature` should be the name of one of the categorical annotation columns
# e.g. `cell_type`, `cell_subtype`, `time_point`
cluster_ids = adata.obs[cluster_feature].cat.codes.unique()
id_to_cluster_map = dict( zip( adata.obs[cluster_feature].cat.codes, adata.obs[cluster_feature] ) )
cluster_to_id_map = dict([[v,k] for k,v in id_to_cluster_map.items()])
fig = go.Figure()
for _cluster_id in adata.obs[cluster_feature].cat.codes.unique():
fig.add_trace(
go.Scattergl(
x = post_adata.obsm[embedding_kind][:,0][post_adata.obs[cluster_feature].cat.codes==_cluster_id]
, y = post_adata.obsm[embedding_kind][:,1][post_adata.obs[cluster_feature].cat.codes==_cluster_id]
, mode='markers'
, marker=dict(
# we randomize colors by starting at a random place in the list of named css colors
color=css_colors[_cluster_id+np.random.randint(0,len(np.unique(css_colors)))]
, size = 3
)
, showlegend=True
, name=id_to_cluster_map[_cluster_id]
, hoverinfo=['name']
)
)
layout={
"title": {"text": plot_title
, 'x':0.5
}
, 'xaxis': {'title': {"text": xlabel}}
, 'yaxis': {'title': {"text": ylabel}}
, "height": 1000
, "width":1400
}
fig.update_layout(layout)
fig.update_layout(showlegend=True)
return fig
tsne = TSNE(callbacks=ErrorLogger(),
initialization='random',
negative_gradient_method='fft',
callbacks_every_iters=200,
n_iter=1400,
neighbors='approx')
tsne_embedding = tsne.fit(latent)
post_adata.obsm["tsne"] = np.array(tsne_embedding)
print('t-SNE embedding done!')
fig = plot_embedding(embedding_kind='tsne',
cluster_feature ='cell_type',
xlabel="t-SNE 1",
ylabel="t-SNE 2",
plot_title="t-SNE on scVI latent space for Packer C. elegans single cell data")
# uncomment this line to save an interactive html plot, in case it is not rendering
# fig.write_html('worms_interactive_tsne.html')
fig.show()
Note: scVI recently introduced a second way to perform DE, and some functions and documentation are still changing.
The old mode is called vanilla and is performed here. The new mode is called change and is not done in this notebook.
vanilla mode, DE is measured by logit(p/(1-p)) where p is the probability of a cell from population A having a higher expression than a cell from population B. Vanilla DE mode (performed in this notebook)¶Explanation adapted from the scVi differential_expression_score docstring.
The vanilla mode follows protocol described in Lopez et al, arXiv:1709.02082.
In this case for a given gene we perform hypothesis testing based on latent variable in the generative model that models the mean of the gene expression.
We are comparing $h_{1g}$, the mean expression of gene $g$ in cell type 1, with $h_{2g}$, the mean expression of $g$ in cell type 2.
The hypotheses are:
$$ M^g_1: h_{1g} > h_{2g} $$$$ M^g_2: h_{1g} \leq h_{2g} $$DE between cell types 1 and 2 for each gene can then be based on the Bayes factors:
$$ \text{Natural Log Bayes Factor for gene g in cell types 1 and 2} = \ln ( {BF^g_{12}) = \ln(\frac{ p(M^g_1 | x_1, x_2)}{p(M^g_2 | x_1, x_2)}}) $$Note that the scvi differential_expression_score returns the natural logarithm of the Bayes Factor.
This is $\ln(BF_{10})$ in the table discussed below.
To compute the gene specific Bayes factors using masks idx1 and idx2 we sample the Posterior in the following way:
The posterior is sampled n_samples times for each subpopulation
For computation efficiency (posterior sampling is quite expensive), instead of comparing element-wise the obtained samples, we can permute posterior samples.
Remember that computing the Bayes Factor requires sampling $q(z_A | x_A)$ and $q(z_B | x_B)$
To learn more about Bayes factors vs. p-values, see the review On p-Values and Bayes Factors by Leonhard Held and Manuela Ott.
For a shorter overview, see this blog post.
A common interpretation table is copied below.
In our notation, $BF_{10}$ is $BF^g_{12}$, $H_0$ is $M^g_1$ and $H_1$ is $M^g_2$
| Bayes factor $BF_{10}$ | $\ln(BF_{10})$ | Interpretation |
|---|---|---|
| > 100 | > 4.60 | Extreme evidence for H1 |
| 30 – 100 | (3.4, 4.6) | Very strong evidence for H1 |
| 10 – 30 | (2.3, 3.4) | Strong evidence for H1 |
| 3 – 10 | (1.1, 2.3) | Moderate evidence for H1 |
| 1 – 3 | (0 , 1.1) | Anecdotal evidence for H1 |
| 1 | 0 | No evidence |
| 1/3 – 1 | (-1.1, 0) | Anecdotal evidence for H0 |
| 1/3 – 1/10 | (-2.30, -1.1) | Moderate evidence for H0 |
| 1/10 – 1/30 | (-3.4, -2.30) | Strong evidence for H0 |
| 1/30 – 1/100 | (-4.6, -3.4) | Very strong evidence for H0 |
| < 1/100 | < -4.6 | Extreme evidence for H0 |
# let's take a look at abundances of different cell types
adata.obs['cell_type'].value_counts()
# let's pick two cell types
cell_type_1 = 'Ciliated_non_amphid_neuron'
cell_type_2 = 'Intestine'
cell_idx1 = adata.obs['cell_type'] == cell_type_1
print(sum(cell_idx1), 'cells of type', cell_type_1)
cell_idx2 = adata.obs['cell_type'] == cell_type_2
print(sum(cell_idx2), 'cells of type', cell_type_2)
n_samples: the number of times to sample the posterior gene frequencies from the vae model for each gene in each cell.M_permutation: Number of pairs sampled for comparison.idx1: boolean array masking subpopulation cells 1. (True where cell is from population) idx2: boolean array masking subpopulation cells 2. (True where cell is from population) n_samples = 10000
M_permutation = 10000
de_res = full.differential_expression_score(
cell_idx1,
cell_idx2,
n_samples=n_samples,
M_permutation=M_permutation,
)
bayesi: The bayes factor for cell type 1 having a higher expression than cell type 2bayesi_permuted: estimate Bayes Factors of random populations of the union of the two cell populationsmeani: average UMI counts in cell type inonzi: proportion of non-zero expression in cell type inorm_meani: average UMI counts in cell type i normalized by cell sizescalei: average scVI imputed gene expression scale in cell type ide_res.head()
# manipulate the DE results for plotting
de = de_res.copy()
# we compute the ratio of the scVI scales to use that as a rough proxy for fold change
de['ratio_scale12']=de['scale1']/de['scale2']
de['log_scale_ratio']=np.log2(de['ratio_scale12'])
# we take absolute values of the first bayes factor as the one to use on the volcano plot
# bayes1 and bayes2 should be roughtly the same, except with opposite signs
de['abs_bayes_factor']=np.abs(de['bayes1'])
#make a copy of the annotated gene metadata with gene ids all lower case to avoid problems when joining dataframes
adata_var_uppercase = adata.var.copy()
adata_var_uppercase.index = adata_var_uppercase.index.str.upper()
#convert top_expression gene ids index to lowercase for joining with metadata
de=de.join(adata_var_uppercase, how='inner')
de.head().T
Because we're using the vanilla mode, note that this volcano plot shows the ratios of the scVI expression scale of the two tissues vs the absolute value of the natural log bayes factor.
The new change mode allows for calculating log fold change and p-values, which are more commonly seen in volcano plots
We can highlight genes of interest based on simple string matching. For example, the cell below highlights all C. elegans neuropeptides (whose name conveniently all start with nlp, ins or flp). Other genes with be a transparent gray dot.
de['gene_color'] = 'rgba(100, 100, 100, 0.25)'
de['gene_color'][de['gene_name'].str.contains('ins-')] = 'rgba(255, 0,0, 1)'
de['gene_color'][de['gene_name'].str.contains('flp-')] = 'rgba(0, 255,0, 1)'
de['gene_color'][de['gene_name'].str.contains('nlp-')] = 'rgba(0, 0,255, 1)'
# first we create these variables to customize the hover text in plotly's heatmap
# the text needs to be arranged in a matrix the same shape as the heatmap
# for the gene descriptions text, which can be several sentences, we add a line break after each sentence
de['gene_description_html'] = de['gene_description'].str.replace('\. ', '.<br>')
fig = go.Figure(
data=go.Scatter(
x=de["log_scale_ratio"].round(3)
, y=de["abs_bayes_factor"].round(3)
, mode='markers'
, marker=dict(color=de['gene_color'])
, hoverinfo='text'
, text=de['gene_description_html']
, customdata=de.gene_id.values + '<br>Name: ' + de.gene_name.values
, hovertemplate='%{customdata} <br>' +
'|ln(BF)|: %{y}<br>' +
'Log2 scale ratio: %{x}' +
'<extra>%{text}</extra>'
)
, layout= {
"title": {"text":
"Differential expression on of Packer C. elegans single cell data between cell types <br> <b>" +
str(cell_type_1) + "</b> and <b>" + str(cell_type_2) + " "
, 'x':0.5
}
, 'xaxis': {'title': {"text": "Log2 of scVI expression scale"}}
, 'yaxis': {'title': {"text": "Absolute value of natural log of Bayes Factor"}}
}
)
# uncomment line below to save the interactive volcano plot as html
# fig.write_html('worms_interactive_volcano_plot.html')
fig.show()
Now we perform DE between each cell type vs all other cells and make a heatmap of the result.
First we need to make cell type sumary with numerical codes for each cell type
# we need to numerically encode the cell types for passing the cluster identity to scVI
cell_code_to_type = dict( zip( adata.obs['cell_type'].cat.codes, adata.obs['cell_type'] ) )
cell_type_to_code_map = dict([[v,k] for k,v in cell_code_to_type.items()])
# check that we got unique cell type labels
assert len(cell_code_to_type)==len(cell_type_to_code_map)
cell_types_summary=pd.DataFrame(index=adata.obs['cell_type'].value_counts().index)
cell_types_summary['cell_type_code']=cell_types_summary.index.map(cell_type_to_code_map)
cell_types_summary['ncells']=adata.obs['cell_type'].value_counts()
cell_types_summary['cell_type_name']=adata.obs['cell_type'].value_counts().index
cell_types_summary.to_csv('packer_cell_types_summary.csv')
cell_types_summary.head()
# create a column in the cell data with the cluster id each cell belongs to
adata.obs['cell_type_code'] = adata.obs['cell_type'].cat.codes
# this returns a list of dataframes with DE results (one for each cluster), and a list with the corresponding cluster id
per_cluster_de, cluster_id = full.one_vs_all_degenes(cell_labels=adata.obs['cell_type_code'].ravel(), min_cells=1)
# pick the top 10 genes in each cluster
top_genes = []
for x in per_cluster_de:
top_genes.append(x[:10])
top_genes = pd.concat(top_genes)
top_genes = np.unique(top_genes.index)
# fetch the expression values for the top 10 genes
top_expression = [x.filter(items=top_genes, axis=0)['norm_mean1'] for x in per_cluster_de]
top_expression = pd.concat(top_expression, axis=1)
top_expression = np.log10(1 + top_expression)
cluster_name = [cell_code_to_type[_id] for _id in cluster_id]
top_expression.columns=cluster_name
# convert into anndata object to tie with more metadata, such as gene names and descriptions
top_expression = anndata.AnnData(top_expression.T)
top_expression.obs = top_expression.obs.join(cell_types_summary)
top_expression.obs.head()
#make a copy of the annotated gene metadata with gene ids all lower case to avoid problems when joining dataframes
adata_var_lowcase = adata.var.copy()
adata_var_lowcase.index = adata_var_lowcase.index.str.lower()
#convert top_expression gene ids index to lowercase for joining with metadata
top_expression.var.index = top_expression.var.index.str.lower()
top_expression.var=top_expression.var.join(adata_var_lowcase)
top_expression.var.index=top_expression.var['gene_id']
top_expression.var.head().style.set_properties(subset=['gene_description'], **{'width': '600px'})
In this heatmap we plot the top 10 genes for each of the 37 annotated tissue types for a total of 242 genes.
# first we create these variables to customize the hover text in plotly's heatmap
# the text needs to be arranged in a matrix the same shape as the heatmap
# for the gene descriptions text, which can be several sentences, we add a line break after each sentence
top_expression.var['gene_description_html'] = top_expression.var['gene_description'].str.replace('\. ', '.<br>')
gene_description_text_matrix = np.tile(top_expression.var['gene_description_html'].values, (len(top_expression.obs['cell_type_name']),1) )
gene_ids_text_matrix = np.tile(top_expression.var['gene_id'].values, (len(top_expression.obs['cell_type_name']),1))
# now create the heatmap with plotly
fig = go.Figure(
data=go.Heatmap(
z=top_expression.X.round(3),
x=top_expression.var['gene_name'],
y=top_expression.obs['cell_type_name'],
hoverinfo='text',
text=gene_description_text_matrix,
customdata=gene_ids_text_matrix,
hovertemplate='%{customdata} <br>Name: %{x}<br>Cell type: %{y}<br>Expression: %{z} <extra>%{text}</extra>',
),
layout= {
"title": {"text": "Differential expression of Packer C. elegans single cell data"},
"height": 800,
},
)
# uncomment line below to save the interactive volcano plot as html
# fig.write_html('worms_interactive_heatmap.html')
fig.show()